Group Project

Import Packages

library("ggplot2")
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.4     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library('geosphere')
library("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.

Importing Data

# Reading in the sample CSV of rider data we made
rider_2019_sample <- read.csv('sample.csv', stringsAsFactors = TRUE)
head(rider_2019_sample)
##   tripduration                starttime                 stoptime
## 1          564 2019-04-11 07:44:36.0580 2019-04-11 07:54:00.1840
## 2         1158 2019-05-15 18:00:33.3890 2019-05-15 18:19:52.0150
## 3          763 2019-03-25 13:27:50.4260 2019-03-25 13:40:33.7960
## 4          915 2019-06-21 15:52:07.8340 2019-06-21 16:07:23.6810
## 5         1368 2019-06-01 18:42:22.9500 2019-06-01 19:05:11.7510
## 6          267 2019-07-31 18:47:05.5630 2019-07-31 18:51:33.0870
##   start.station.id       start.station.name start.station.latitude
## 1             3711       E 13 St & Avenue A               40.72967
## 2             3016        Kent Ave & N 7 St               40.72037
## 3              382  University Pl & E 14 St               40.73493
## 4              359       E 47 St & Park Ave               40.75510
## 5             3295 Central Park W & W 96 St               40.79127
## 6             3377     Carroll St & Bond St               40.67861
##   start.station.longitude end.station.id  end.station.name end.station.latitude
## 1               -73.98068            168   W 18 St & 6 Ave             40.73971
## 2               -73.96165           3016 Kent Ave & N 7 St             40.72037
## 3               -73.99201            459  W 20 St & 11 Ave             40.74674
## 4               -73.97499            483   E 12 St & 3 Ave             40.73223
## 5               -73.96484           3142   1 Ave & E 62 St             40.76123
## 6               -73.99037           3398   Smith St & 9 St             40.67470
##   end.station.longitude bikeid   usertype birth.year gender
## 1             -73.99456  29807 Subscriber       1994      1
## 2             -73.96165  34411 Subscriber       1974      1
## 3             -74.00776  16078 Subscriber       1961      1
## 4             -73.98890  29904 Subscriber       1964      2
## 5             -73.96094  30247   Customer       1969      0
## 6             -73.99786  20315 Subscriber       1971      1
# Reading in the weather data set
weather_data <- read.csv('NYCWeather2019.csv', stringsAsFactors = TRUE)
head(weather_data)
##       STATION                        NAME     DATE AWND PRCP SNOW SNWD TAVG
## 1 USW00094728 NY CITY CENTRAL PARK, NY US 1/1/2019   NA 0.06    0    0   NA
## 2 USW00094728 NY CITY CENTRAL PARK, NY US 1/2/2019   NA 0.00    0    0   NA
## 3 USW00094728 NY CITY CENTRAL PARK, NY US 1/3/2019   NA 0.00    0    0   NA
## 4 USW00094728 NY CITY CENTRAL PARK, NY US 1/4/2019   NA 0.00    0    0   NA
## 5 USW00094728 NY CITY CENTRAL PARK, NY US 1/5/2019   NA 0.50    0    0   NA
## 6 USW00094728 NY CITY CENTRAL PARK, NY US 1/6/2019   NA 0.00    0    0   NA
##   TMAX TMIN
## 1   58   39
## 2   40   35
## 3   44   37
## 4   47   35
## 5   47   41
## 6   49   31

Initial Data Summary

# Initial summary of rider data set
str(rider_2019_sample)
## 'data.frame':    100000 obs. of  15 variables:
##  $ tripduration           : int  564 1158 763 915 1368 267 661 1112 520 512 ...
##  $ starttime              : Factor w/ 99999 levels "2019-01-01 00:56:30.7720",..: 18803 28405 14066 41002 34169 54789 95279 5247 68397 75686 ...
##  $ stoptime               : Factor w/ 100000 levels "2019-01-01 01:34:45.0200",..: 18804 28409 14065 41001 34174 54787 95282 5246 68395 75682 ...
##  $ start.station.id       : Factor w/ 825 levels "116","119","120",..: 621 86 688 538 263 348 749 80 259 545 ...
##  $ start.station.name     : Factor w/ 894 levels "1 Ave & E 110 St",..: 352 545 760 386 250 234 797 672 440 99 ...
##  $ start.station.latitude : num  40.7 40.7 40.7 40.8 40.8 ...
##  $ start.station.longitude: num  -74 -74 -74 -74 -74 ...
##  $ end.station.id         : Factor w/ 828 levels "116","119","120",..: 15 86 752 774 184 369 623 27 333 509 ...
##  $ end.station.name       : Factor w/ 890 levels "1 Ave & E 110 St",..: 793 549 795 350 7 714 787 371 598 92 ...
##  $ end.station.latitude   : num  40.7 40.7 40.7 40.7 40.8 ...
##  $ end.station.longitude  : num  -74 -74 -74 -74 -74 ...
##  $ bikeid                 : int  29807 34411 16078 29904 30247 20315 40128 33989 29972 20897 ...
##  $ usertype               : Factor w/ 2 levels "Customer","Subscriber": 2 2 2 2 1 2 1 2 2 2 ...
##  $ birth.year             : int  1994 1974 1961 1964 1969 1971 1969 1960 1972 1966 ...
##  $ gender                 : int  1 1 1 2 0 1 0 1 1 1 ...
summary(rider_2019_sample)
##   tripduration                          starttime    
##  Min.   :     61.0   2019-11-22 17:59:58.4760:    2  
##  1st Qu.:    362.0   2019-01-01 00:56:30.7720:    1  
##  Median :    614.0   2019-01-01 01:35:30.5010:    1  
##  Mean   :    950.8   2019-01-01 02:04:41.7180:    1  
##  3rd Qu.:   1075.0   2019-01-01 02:25:28.9700:    1  
##  Max.   :2769536.0   2019-01-01 02:33:50.6550:    1  
##                      (Other)                 :99993  
##                      stoptime     start.station.id
##  2019-01-01 01:34:45.0200:    1   519    :  810   
##  2019-01-01 01:51:55.8730:    1   3255   :  617   
##  2019-01-01 02:13:13.4810:    1   497    :  602   
##  2019-01-01 02:29:13.1090:    1   402    :  561   
##  2019-01-01 03:04:23.8640:    1   435    :  551   
##  2019-01-01 04:09:48.6020:    1   (Other):96523   
##  (Other)                 :99994   NA's   :  336   
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  810    Min.   :40.66          Min.   :-74.03         
##  8 Ave & W 31 St      :  617    1st Qu.:40.72          1st Qu.:-74.00         
##  E 17 St & Broadway   :  602    Median :40.74          Median :-73.98         
##  Broadway & E 22 St   :  561    Mean   :40.74          Mean   :-73.98         
##  W 21 St & 6 Ave      :  551    3rd Qu.:40.76          3rd Qu.:-73.97         
##  Broadway & E 14 St   :  548    Max.   :40.85          Max.   :-73.88         
##  (Other)              :96311                                                  
##  end.station.id               end.station.name end.station.latitude
##  519    :  792   Pershing Square North:  792   Min.   :40.66       
##  402    :  636   Broadway & E 22 St   :  636   1st Qu.:40.72       
##  3255   :  632   8 Ave & W 31 St      :  632   Median :40.74       
##  497    :  623   E 17 St & Broadway   :  623   Mean   :40.74       
##  285    :  547   Broadway & E 14 St   :  547   3rd Qu.:40.76       
##  (Other):96426   W 21 St & 6 Ave      :  544   Max.   :40.86       
##  NA's   :  344   (Other)              :96226                       
##  end.station.longitude     bikeid            usertype       birth.year  
##  Min.   :-74.03        Min.   :14529   Customer  :14054   Min.   :1885  
##  1st Qu.:-74.00        1st Qu.:25346   Subscriber:85946   1st Qu.:1970  
##  Median :-73.99        Median :30918                      Median :1983  
##  Mean   :-73.98        Mean   :29674                      Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35049                      3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                      Max.   :2003  
##                                                                         
##      gender     
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :1.000  
##  Mean   :1.161  
##  3rd Qu.:1.000  
##  Max.   :2.000  
## 
# Initial summart of weather data set
str(weather_data)
## 'data.frame':    365 obs. of  10 variables:
##  $ STATION: Factor w/ 1 level "USW00094728": 1 1 1 1 1 1 1 1 1 1 ...
##  $ NAME   : Factor w/ 1 level "NY CITY CENTRAL PARK, NY US": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DATE   : Factor w/ 365 levels "1/1/2019","1/10/2019",..: 1 12 23 26 27 28 29 30 31 2 ...
##  $ AWND   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PRCP   : num  0.06 0 0 0 0.5 0 0 0.17 0.06 0 ...
##  $ SNOW   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SNWD   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ TAVG   : logi  NA NA NA NA NA NA ...
##  $ TMAX   : int  58 40 44 47 47 49 34 45 45 34 ...
##  $ TMIN   : int  39 35 37 35 41 31 25 34 34 28 ...
summary(rider_2019_sample)
##   tripduration                          starttime    
##  Min.   :     61.0   2019-11-22 17:59:58.4760:    2  
##  1st Qu.:    362.0   2019-01-01 00:56:30.7720:    1  
##  Median :    614.0   2019-01-01 01:35:30.5010:    1  
##  Mean   :    950.8   2019-01-01 02:04:41.7180:    1  
##  3rd Qu.:   1075.0   2019-01-01 02:25:28.9700:    1  
##  Max.   :2769536.0   2019-01-01 02:33:50.6550:    1  
##                      (Other)                 :99993  
##                      stoptime     start.station.id
##  2019-01-01 01:34:45.0200:    1   519    :  810   
##  2019-01-01 01:51:55.8730:    1   3255   :  617   
##  2019-01-01 02:13:13.4810:    1   497    :  602   
##  2019-01-01 02:29:13.1090:    1   402    :  561   
##  2019-01-01 03:04:23.8640:    1   435    :  551   
##  2019-01-01 04:09:48.6020:    1   (Other):96523   
##  (Other)                 :99994   NA's   :  336   
##              start.station.name start.station.latitude start.station.longitude
##  Pershing Square North:  810    Min.   :40.66          Min.   :-74.03         
##  8 Ave & W 31 St      :  617    1st Qu.:40.72          1st Qu.:-74.00         
##  E 17 St & Broadway   :  602    Median :40.74          Median :-73.98         
##  Broadway & E 22 St   :  561    Mean   :40.74          Mean   :-73.98         
##  W 21 St & 6 Ave      :  551    3rd Qu.:40.76          3rd Qu.:-73.97         
##  Broadway & E 14 St   :  548    Max.   :40.85          Max.   :-73.88         
##  (Other)              :96311                                                  
##  end.station.id               end.station.name end.station.latitude
##  519    :  792   Pershing Square North:  792   Min.   :40.66       
##  402    :  636   Broadway & E 22 St   :  636   1st Qu.:40.72       
##  3255   :  632   8 Ave & W 31 St      :  632   Median :40.74       
##  497    :  623   E 17 St & Broadway   :  623   Mean   :40.74       
##  285    :  547   Broadway & E 14 St   :  547   3rd Qu.:40.76       
##  (Other):96426   W 21 St & 6 Ave      :  544   Max.   :40.86       
##  NA's   :  344   (Other)              :96226                       
##  end.station.longitude     bikeid            usertype       birth.year  
##  Min.   :-74.03        Min.   :14529   Customer  :14054   Min.   :1885  
##  1st Qu.:-74.00        1st Qu.:25346   Subscriber:85946   1st Qu.:1970  
##  Median :-73.99        Median :30918                      Median :1983  
##  Mean   :-73.98        Mean   :29674                      Mean   :1980  
##  3rd Qu.:-73.97        3rd Qu.:35049                      3rd Qu.:1990  
##  Max.   :-73.89        Max.   :42046                      Max.   :2003  
##                                                                         
##      gender     
##  Min.   :0.000  
##  1st Qu.:1.000  
##  Median :1.000  
##  Mean   :1.161  
##  3rd Qu.:1.000  
##  Max.   :2.000  
## 

Adjusting Dates in Data Sets

# Creating columns of just month, day, and year
weather_data$DATE <- as.Date(weather_data$DATE, format = "%m/%d/%Y")
weather_data$Month <- format(weather_data$DATE, "%m")
weather_data$Day <- format(weather_data$DATE, "%d")
weather_data$Year <- format(weather_data$DATE, "%Y")
# Creating columns of just month, day, and year
rider_2019_sample$DATE <- as.Date(rider_2019_sample$starttime, format = "%Y-%m-%d")
rider_2019_sample$Month <- format(rider_2019_sample$DATE, "%m")
rider_2019_sample$Day <- format(rider_2019_sample$DATE, "%d")
rider_2019_sample$Year <- format(rider_2019_sample$DATE, "%Y")

Rider Age

rider_2019_sample$age <- 2019 - as.numeric(as.character(rider_2019_sample$birth.year))
rider_2019_sample <- filter(rider_2019_sample, age <= 80)

Combining Data Sets

# Combining data frames to compare data
edited_weather <- select(weather_data,
                         PRCP,
                         SNOW,
                         AWND,
                         DATE)
edited_rider <- select(rider_2019_sample, 
                       age,
                       gender,
                       usertype,
                       tripduration,
                       start.station.latitude,
                       start.station.longitude,
                       start.station.id,
                       start.station.name,
                       end.station.latitude,
                       end.station.longitude,
                       end.station.id,
                       end.station.name,
                       DATE,
                       Day,
                       Month,
                       Year)

total_data = merge(edited_weather, edited_rider, by.x="DATE", by.y="DATE", all.x=TRUE)
head(total_data)
##         DATE PRCP SNOW AWND age gender   usertype tripduration
## 1 2019-01-01 0.06    0   NA  52      1 Subscriber         1166
## 2 2019-01-01 0.06    0   NA  33      1 Subscriber          532
## 3 2019-01-01 0.06    0   NA  55      1 Subscriber          263
## 4 2019-01-01 0.06    0   NA  29      1 Subscriber          196
## 5 2019-01-01 0.06    0   NA  28      1 Subscriber          710
## 6 2019-01-01 0.06    0   NA  37      2 Subscriber          312
##   start.station.latitude start.station.longitude start.station.id
## 1               40.72037               -73.96165             3016
## 2               40.67583               -73.95617             3569
## 3               40.74517               -73.98683              474
## 4               40.72308               -73.98584             3656
## 5               40.75187               -73.97771              519
## 6               40.71422               -73.98135              502
##            start.station.name end.station.latitude end.station.longitude
## 1           Kent Ave & N 7 St             40.72080             -73.95485
## 2 Franklin Ave & St Marks Ave             40.69073             -73.95133
## 3             5 Ave & E 29 St             40.74034             -73.98955
## 4           E 2 St & Avenue A             40.72087             -73.98086
## 5       Pershing Square North             40.73222             -73.98166
## 6         Henry St & Grand St             40.72217             -73.98369
##   end.station.id             end.station.name Day Month Year
## 1           3101        N 12 St & Bedford Ave  01    01 2019
## 2           3056 Kosciuszko St & Nostrand Ave  01    01 2019
## 3            402           Broadway & E 22 St  01    01 2019
## 4            150            E 2 St & Avenue C  01    01 2019
## 5            504              1 Ave & E 16 St  01    01 2019
## 6            301            E 2 St & Avenue B  01    01 2019

Initial Data Analysis

Gender Split in Riders

# Reclassifying the genders
# 0=unknown, 1=male, 2=female
total_data$gender <- ifelse(total_data$gender == 0, "Unknown",
                                  ifelse(total_data$gender == 1, "Male", "Female"))

# Seeing the split of genders who rented bikes
total_data %>%
  ggplot(aes(x=gender)) +
  geom_bar()

Subscriber vs Customer for Riders

# Seeing the split of user type who rented bikes
total_data %>%
  ggplot(aes(x=usertype)) +
  geom_bar()

Trip Duration

# Range of all bike rides
total_data <- filter(total_data, tripduration <= 3600)
duration_range <- range(total_data$tripduration, na.rm=TRUE)
duration_range
## [1]   61 3599
# Average length of a bike ride
duration_mean <- mean(total_data$tripduration, na.rm=TRUE)
duration_mean
## [1] 789.341
# Standard deviation of bike rides
duration_sd <- sd(total_data$tripduration, na.rm=TRUE)
duration_sd
## [1] 587.415

Trip Duration with Rain

# Range of all bike rides affected by rain
total_data_rain <- filter(total_data, SNOW == 0, PRCP > 0)
duration_range_rain <- range(total_data_rain$tripduration, na.rm=TRUE)
duration_range_rain
## [1]   61 3598
# Average length of a bike ride affected by rain
duration_mean_rain <- mean(total_data_rain$tripduration, na.rm=TRUE)
duration_mean_rain
## [1] 777.5114
# Standard deviation of bike rides affected by rain
duration_sd_rain <- sd(total_data_rain$tripduration, na.rm=TRUE)
duration_sd_rain
## [1] 575.1325

Trip Duration with Snow

# Range of all bike rides affected by snow
total_data_snow <- filter(total_data, SNOW > 0)
duration_range_snow <- range(total_data_snow$tripduration, na.rm=TRUE)
duration_range_snow
## [1]   62 3548
# Average length of a bike ride affected by snow
duration_mean_snow <- mean(total_data_snow$tripduration, na.rm=TRUE)
duration_mean_snow
## [1] 660.3067
# Standard deviation of bike rides affected by snow
duration_sd_snow <- sd(total_data_snow$tripduration, na.rm=TRUE)
duration_sd_snow
## [1] 525.4768

Trip Duration with Wind

# Range of all bike rides affected by wind
total_data_wind <- filter(total_data, SNOW == 0, PRCP == 0, AWND > 0)
duration_range_wind <- range(total_data_wind$tripduration, na.rm=TRUE)
duration_range_wind
## [1]   61 3599
# Average length of a bike ride affected by wind
duration_mean_wind <- mean(total_data_wind$tripduration, na.rm=TRUE)
duration_mean_wind
## [1] 816.5905
# Standard deviation of bike rides affected by wind
duration_sd_wind <- sd(total_data_wind$tripduration, na.rm=TRUE)
duration_sd_wind
## [1] 601.8395

Types of Weather per Month

# Average rain per month
total_data %>%
  filter(SNOW == 0) %>%
  summarise(average_rain = tapply(PRCP, Month, mean, na.rm=TRUE))
##    average_rain
## 1    0.07898309
## 2    0.06682057
## 3    0.06311462
## 4    0.12180282
## 5    0.13145392
## 6    0.14865237
## 7    0.15799374
## 8    0.10023051
## 9    0.02168239
## 10   0.12288957
## 11   0.03855465
## 12   0.17046460
# Average snow per month
total_data %>% 
  summarise(avg_snow = tapply(SNOW, Month, mean, na.rm=TRUE))
##      avg_snow
## 1  0.03033660
## 2  0.05669672
## 3  0.18865182
## 4  0.00000000
## 5  0.00000000
## 6  0.00000000
## 7  0.00000000
## 8  0.00000000
## 9  0.00000000
## 10 0.00000000
## 11 0.00000000
## 12 0.06747897
# Average wind speed per month
total_data %>%
  summarise(average_wind_speed = tapply(AWND, Month, mean, na.rm=TRUE))
##    average_wind_speed
## 1                 NaN
## 2                 NaN
## 3            4.918059
## 4            4.345938
## 5            3.726322
## 6            4.111195
## 7            3.405346
## 8            3.847932
## 9            4.285026
## 10           5.247129
## 11           5.304712
## 12           6.341162

Exploratory Data Analysis - Weather Effects

Average Rain by Age

# Trip duration by age of riders and rain amount
plot_data <- total_data %>%
  filter(SNOW == 0) %>%
  group_by(age) %>%
  summarise(mean_PRCP_by_age = mean(PRCP),
            mean_duration = mean(tripduration)) 
## `summarise()` ungrouping output (override with `.groups` argument)
plot_data %>%
  ggplot(aes(x = age, y = mean_PRCP_by_age)) +
  geom_point(alpha =0.9, shape = 18, colour = "blue", size = plot_data$mean_duration/150) +
  geom_smooth(colour = "orange") 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Average Wind by Age

# Mean Wind by Age of Rider
total_data %>% 
  group_by(age) %>%
  summarise(mean_AWND_by_age = mean(AWND,na.rm = TRUE)) %>%
  ggplot(aes(x = age, y = mean_AWND_by_age)) + geom_line() + geom_smooth() 
## `summarise()` ungrouping output (override with `.groups` argument)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Rain Effects on Trip Duration

# Average ride time when it's raining
total_data %>%
  filter(PRCP > 0, SNOW == 0) %>%
  summarise(prcp_duration_mean = mean(tripduration))
##   prcp_duration_mean
## 1           777.5114
total_data %>% 
  filter(PRCP > 0, SNOW == 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

total_data %>%
  filter(PRCP > 0, SNOW == 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_density(aes(fill=factor(PRCP)), alpha=0.8)

total_data %>%
  filter(PRCP > 0, SNOW == 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Snow Effects on Trip Duration

# Average ride time when it's snowing
total_data %>%
  filter(SNOW > 0) %>%
  summarise(snow_duration_mean = mean(tripduration))
##   snow_duration_mean
## 1           660.3067
total_data %>% 
  filter(SNOW > 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

total_data %>%
  filter(SNOW > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_density(aes(fill=factor(SNOW)), alpha=0.8)

total_data %>%
  filter(SNOW > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Wind Effects on Trip Duration

# Average ride time when it's windy
total_data %>%
  filter(AWND > 0) %>%
  summarise(wind_duration_mean = mean(tripduration))
##   wind_duration_mean
## 1           803.9685
total_data %>% 
  filter(AWND > 0) %>%
  ggplot(aes(x = tripduration)) + 
  geom_histogram(aes(y=..density..), colour="black", fill="white") +
  geom_density(alpha=.2, fill="#FF6666") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

total_data %>%
  filter(AWND > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_density(aes(fill=factor(AWND)), alpha=0.8)

total_data %>%
  filter(AWND > 0) %>%
  ggplot(aes(x = tripduration)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Weather Effects on Number of Rides over Average Ride Time

# Number of rides over average time sin weather effects
ride_num <- total_data %>%
  filter(tripduration > duration_mean) %>%
  count()
ride_num[1,1]
## [1] 37200
# Number of rides over average time with rain
rain_num <- total_data %>%
  filter(SNOW == 0, PRCP > 0, tripduration > duration_mean) %>%
  count()
rain_num[1,1]
## [1] 12452
# Number of rides over average time with snow
snow_num <- total_data %>%
  filter(SNOW > 0, tripduration > duration_mean) %>%
  count()
snow_num[1,1]
## [1] 503
# Number of rides over average time with wind
wind_num <- total_data %>%
  filter(AWND > 0, tripduration > duration_mean) %>%
  count()
wind_num[1,1]
## [1] 34200

Exploratory Data Analysis - Ride History

Distance Between Stations

# Distance between start and end station in Meters
total_data <- mutate(total_data, 
                            distance = distHaversine(cbind(total_data$start.station.longitude,
                                                           total_data$start.station.latitude),
                                                     cbind(total_data$end.station.longitude,
                                                           total_data$end.station.latitude)))

head(total_data)
##         DATE PRCP SNOW AWND age gender   usertype tripduration
## 1 2019-01-01 0.06    0   NA  52   Male Subscriber         1166
## 2 2019-01-01 0.06    0   NA  33   Male Subscriber          532
## 3 2019-01-01 0.06    0   NA  55   Male Subscriber          263
## 4 2019-01-01 0.06    0   NA  29   Male Subscriber          196
## 5 2019-01-01 0.06    0   NA  28   Male Subscriber          710
## 6 2019-01-01 0.06    0   NA  37 Female Subscriber          312
##   start.station.latitude start.station.longitude start.station.id
## 1               40.72037               -73.96165             3016
## 2               40.67583               -73.95617             3569
## 3               40.74517               -73.98683              474
## 4               40.72308               -73.98584             3656
## 5               40.75187               -73.97771              519
## 6               40.71422               -73.98135              502
##            start.station.name end.station.latitude end.station.longitude
## 1           Kent Ave & N 7 St             40.72080             -73.95485
## 2 Franklin Ave & St Marks Ave             40.69073             -73.95133
## 3             5 Ave & E 29 St             40.74034             -73.98955
## 4           E 2 St & Avenue A             40.72087             -73.98086
## 5       Pershing Square North             40.73222             -73.98166
## 6         Henry St & Grand St             40.72217             -73.98369
##   end.station.id             end.station.name Day Month Year  distance
## 1           3101        N 12 St & Bedford Ave  01    01 2019  576.0106
## 2           3056 Kosciuszko St & Nostrand Ave  01    01 2019 1707.3540
## 3            402           Broadway & E 22 St  01    01 2019  584.0158
## 4            150            E 2 St & Avenue C  01    01 2019  486.4067
## 5            504              1 Ave & E 16 St  01    01 2019 2213.1388
## 6            301            E 2 St & Avenue B  01    01 2019  907.8033

Speed of Rider Demographics

# Speed of the rider
total_data$speed <- total_data$distance/total_data$tripduration

# Average speed of all riders
all_ride <- total_data %>%
  summarise(average_speed = mean(speed))

# Average speed of young riders
young_ride <- total_data %>%
  filter(age <= 45) %>%
  summarise(young_average = mean(speed))

# Average speed of old riders
old_ride <- total_data %>%
  filter(age >= 65) %>%
  summarise(old_average = mean(speed))

# Average speed of female riders
fem_ride <- total_data %>%
  filter(gender == "Female") %>%
  summarise(female_average = mean(speed))

# Average speed of male riders
male_ride <- total_data %>%
  filter(gender == "Male") %>%
  summarise(male_average = mean(speed))

# Average speed of subscribers
sub_ride <- total_data %>%
  filter(usertype == "Customer") %>%
  summarise(customer_average = mean(speed))

# Average speed of customers
cust_ride <- total_data %>%
  filter(usertype == "Subscriber") %>%
  summarise(subscriber_average = mean(speed))

Reduce(merge, list(all_ride,
                   young_ride,
                   old_ride,
                   fem_ride,
                   male_ride,
                   sub_ride,
                   cust_ride))
##   average_speed young_average old_average female_average male_average
## 1      2.462556      2.538826     2.19201       2.326377     2.572124
##   customer_average subscriber_average
## 1         1.801693           2.565832
# Scatter Plot of speed by age
total_data %>%
  ggplot(aes(x = age, y = speed, colour = gender)) +
  geom_point(alpha = .4, size = 1.5) +
  scale_colour_manual(name = 'Gender',
                      values = setNames(c('blue','magenta', 'dark green'),
                                        c('Male', 'Female', 'Unknown'))) +
  geom_smooth(method='lm', colour = 'black') +
  labs(title="Average Speed of Riders by Age", x="Speed", y="Age")
## `geom_smooth()` using formula 'y ~ x'

# Boxplot of speed by gender
total_data %>%
  ggplot(aes(x = gender, y = speed, colour = gender)) +
  geom_boxplot(outlier.colour = 'red') +
  scale_colour_manual(name = 'Gender',
                      values = setNames(c('blue','magenta', 'dark green'),
                                        c('Male', 'Female', 'Unknown'))) +
  labs(title="Speed of Riders by Gender", x="Gender", y="Speed")

# Boxplot of speed by customer type
total_data %>%
  ggplot(aes(x = usertype, y = speed, colour = usertype)) +
  geom_boxplot(outlier.colour = 'red') +
  scale_colour_manual(name = 'User Type',
                      values = setNames(c('purple', 'orange'),
                                        c('Subscriber', 'Customer'))) +
  labs(title="Speed of Riders by Customer Type", x="Customer Type", y="Speed")

Exploratory Data Analysis - Ride History

Start Locations

top_height <- max(total_data$start.station.latitude) - min(total_data$start.station.latitude)
top_width <- max(total_data$start.station.longitude) - min(total_data$start.station.longitude)
top_borders <- c(bottom  = min(total_data$start.station.latitude)  - 0.1 * top_height,
                 top     = max(total_data$start.station.latitude)  + 0.1 * top_height,
                 left    = min(total_data$start.station.longitude) - 0.2 * top_width,
                 right   = max(total_data$start.station.longitude) + 0.2 * top_width)

start <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/12/1205/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1537.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1538.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1539.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1540.png
## Source : http://tile.stamen.com/toner-lite/12/1205/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1206/1541.png
## Source : http://tile.stamen.com/toner-lite/12/1207/1541.png
start_map <- ggmap(start, extent = "device", legend = "topright")

start_map + stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
size = 1, bins = 5, data = total_data,
geom = "polygon"
)

Start Location Preferences - by Day of Week

# convert dates to weekdays
total_data$day_of_week = weekdays(total_data$DATE)

start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = total_data) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ day_of_week)

Start Location Preferences - by Customer Type

start_map +
stat_density2d(
aes(x = start.station.longitude, y = start.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = total_data) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ usertype)

Start Location Preferences - by Trip Duration

## break down by one standard deviation above and below average of trip duration
ggmap(start) +
    geom_point(data = total_data, mapping = aes(x = start.station.longitude, y = start.station.latitude,
                                        col = tripduration)) +
    scale_color_gradient(low = "yellow", high = "red")

End Location Preferences

## before noon and after noon
end <- get_stamenmap(top_borders, zoom = 12, maptype = "toner-lite")
end_map <- ggmap(end, extent = "device", legend = "topright")

end_map + stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
size = 1, bins = 5, data = total_data,
geom = "polygon"
)

End Location Preferences - by User Type

end_map +
stat_density2d(
aes(x = end.station.longitude, y = end.station.latitude, fill = ..level.., alpha = ..level..),
bins = 5, geom = "polygon",
data = total_data) +
scale_fill_gradient(low = "black", high = "red") +
facet_wrap(~ usertype)

Asymmetrical Traffic Data

total_rides = count(total_data)
test = total_data
test$start.station.name = as.character(test$start.station.name)
test$end.station.name = as.character(test$end.station.name)
test <- test[test$start.station.name==test$end.station.name, ]
same_station = count(test)

same_station / total_rides
##            n
## 1 0.01959041

Only ~2.2% of rides start and end at the same station.

start_popularity = sort(table(total_data$start.station.name), decreasing=TRUE)
top10 = round(length(unique(total_data$start.station.name, na.rm=TRUE))*0.1)
top_10 = head(start_popularity, top10)
barplot(top_10)

top_starts = as.data.frame(top_10)
top_10rides = sum(top_starts$Freq)

top_10rides / total_rides
##           n
## 1 0.3290259

32.9% of bike rides start from the top 10% most used stations.

inflow vs outflow (start rides / end rides) in 2019

count_starts = as.data.frame(table(total_data$start.station.name))
names(count_starts) = c("station", "starts")
count_ends = as.data.frame(table(total_data$end.station.name))
names(count_ends) = c("station", "ends")
station_flow = as.data.frame(merge(count_starts, count_ends, by.x="station", by.y="station", all.x=TRUE))
station_flow$net = station_flow$starts - station_flow$ends

station_flow %>% mutate(station = fct_reorder(station, net)) %>% ggplot(aes(x=station, y=net)) + geom_point(stat = "identity")+ geom_hline(yintercept=0, linetype="dashed", color = "red")
## Warning: Removed 15 rows containing missing values (geom_point).